Space Tourism¶

In [1]:
base_dir = '../'

1. Data pre-processing¶

Importing libraries¶

In [78]:
import numpy as np
import pandas as pd
import shap as shap
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
from uszipcode import SearchEngine
from pandas_profiling import ProfileReport


# settings to display all columns
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

Reading data¶

In [79]:
data = pd.read_csv(base_dir + 'data/modelingData_SpaceTourism_modified.csv')
data['zip'] = data['zip'].astype(int)
data.head()
Out[79]:
id choice zip year_birth gender annual_income household_annual_income number_vehicles level_education work_type children_home household_type status_in_household type_residence housing_tenure_type origin race citizenship risk_activities_sports price_attribute availability probability_fatality training number_passengers takeoff_location price_dollars alternative
0 1 0 55906 1976 1 8 9 1 7 1 2 1 1 1 1 2 3 1 1 2 0 1 0 0 0 453125.0 suborbital
1 1 0 55906 1976 1 8 9 1 7 1 2 1 1 1 1 2 3 1 1 2 0 0 1 0 0 453125.0 orbital
2 1 1 55906 1976 1 8 9 1 7 1 2 1 1 1 1 2 3 1 1 0 0 0 0 1 0 3750.0 moon_trip
3 1 0 55906 1976 1 8 9 1 7 1 2 1 1 1 1 2 3 1 1 -1 -1 -1 -1 -1 -1 0.0 not_travel
4 2 1 55906 1976 1 8 9 1 7 1 2 1 1 1 1 2 3 1 1 2 0 0 1 0 0 453125.0 suborbital
In [80]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8640 entries, 0 to 8639
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       8640 non-null   int64  
 1   choice                   8640 non-null   int64  
 2   zip                      8640 non-null   int64  
 3   year_birth               8640 non-null   int64  
 4   gender                   8640 non-null   int64  
 5   annual_income            8640 non-null   int64  
 6   household_annual_income  8640 non-null   int64  
 7   number_vehicles          8640 non-null   int64  
 8   level_education          8640 non-null   int64  
 9   work_type                8640 non-null   int64  
 10  children_home            8640 non-null   int64  
 11  household_type           8640 non-null   int64  
 12  status_in_household      8640 non-null   int64  
 13  type_residence           8640 non-null   int64  
 14  housing_tenure_type      8640 non-null   int64  
 15  origin                   8640 non-null   int64  
 16  race                     8640 non-null   int64  
 17  citizenship              8640 non-null   int64  
 18  risk_activities_sports   8640 non-null   int64  
 19  price_attribute          8640 non-null   int64  
 20  availability             8640 non-null   int64  
 21  probability_fatality     8640 non-null   int64  
 22  training                 8640 non-null   int64  
 23  number_passengers        8640 non-null   int64  
 24  takeoff_location         8640 non-null   int64  
 25  price_dollars            8640 non-null   float64
 26  alternative              8640 non-null   object 
dtypes: float64(1), int64(25), object(1)
memory usage: 1.8+ MB

Feature Engineering¶

In [81]:
cleanup_nums = {
    'gender': {
        1: 'male',
        2: 'female'
    },
    #! ADD ORDINAL CATEGORIES
    'annual_income': {
        1: 'a_less_than_10k',
        2: 'b_10k_15k',
        3: 'c_15k_25k',
        4: 'd_25k_35k',
        5: 'e_35k_50k',
        6: 'f_50k_75k',
        7: 'g_75k_100k',
        8: 'h_100k_150k',
        9: 'i_150k_200k',
        10: 'j_more_than_200k'
    },
    #! ADD ORDINAL CATEGORIES
    'household_annual_income': {
        1: 'a_less_than_10k',
        2: 'b_10k_15k',
        3: 'c_15k_25k',
        4: 'd_25k_35k',
        5: 'e_35k_50k',
        6: 'f_50k_75k',
        7: 'g_75k_100k',
        8: 'h_100k_150k',
        9: 'i_150k_200k',
        10: 'j_more_than_200k'
    },
    #! ADD ORDINAL CATEGORIES
    'number_vehicles': {
        1: '1_car',
        2: '2_cars',
        3: '3_cars',
        4: '4_or_more_cars'
    },
    #! ADD ORDINAL CATEGORIES
    'level_education': {
        1: 'less_9th_grade',
        2: '9th_12th_grade_nodiploma',
        3: 'high_school_graduate',
        4: 'some_college',
        5: 'associate_degree',
        6: 'bachelor_degree',
        7: 'grad_prof_degree'
    },
    'work_type': {
        1: 'private',
        2: 'government',
        3: 'self_employed',
        4: 'unpaid_work'
    },
    #! ADD ORDINAL CATEGORIES
    'children_home': {
        1: '0_children',
        2: '1_child',
        3: '2_children',
        4: '3_children',
        5: '4_children',
        6: '5_children_or_more'
    },
    'household_type': {
        1: '1_couple_with_children',
        2: '0_couple_no_children',
        3: '1_male_children',
        4: '0_male_no_children',
        5: '1_female_children',
        6: '0_female_no_children',
        7: '0_alone',
        8: '2_other'
    },
    'status_in_household': {
        1: 'head',
        2: 'spouse',
        3: 'child',
        4: 'other'
    },
    'type_residence': {
        1: 'house',
        2: 'apartment',
        3: 'other'
    },
    'housing_tenure_type': {
        1: 'own',
        2: 'rent'
    },
    'origin': {
        1: 'hispanic',
        2: 'non_hispanic'
    },
    'race': {
        1: 'white',
        2: 'black',
        3: 'asian',
        4: 'hawaian_pacific',
        5: 'other_race',
        6: 'two_or_more_races'
    },
    'citizenship': {
        1: 'us_citizen',
        2: 'other'
    },
    #! ADD ORDINAL CATEGORIES
    'risk_activities_sports': {
        1: 'never',
        2: 'rarely',
        3: 'often'
    },
    'price_attribute': {
        0: '3_perc_annual_income',
        1: '50_perc_annual_income',
        2: '362_perc_annual_income',
        -1: np.nan
    },
    'availability': {
        0: 'immediate',
        1: 'in_5_years',
        -1: np.nan
    },
    'probability_fatality': {
        0: 0.5,
        1: 7.5,
        -1: np.nan
    },
    'training': {
        0: 'no',
        1: 'yes',
        -1: np.nan
    },
    'number_passengers': {
        0: 'one',
        1: 'more_than_one',
        -1: np.nan
    },
    'takeoff_location': {
        0: 'usa',
        1: 'other',
        -1: np.nan
    }
}
In [82]:
data = data.replace(cleanup_nums)

data.head()
Out[82]:
id choice zip year_birth gender annual_income household_annual_income number_vehicles level_education work_type children_home household_type status_in_household type_residence housing_tenure_type origin race citizenship risk_activities_sports price_attribute availability probability_fatality training number_passengers takeoff_location price_dollars alternative
0 1 0 55906 1976 male h_100k_150k i_150k_200k 1_car grad_prof_degree private 1_child 1_couple_with_children head house own non_hispanic asian us_citizen never 362_perc_annual_income immediate 7.5 no one usa 453125.0 suborbital
1 1 0 55906 1976 male h_100k_150k i_150k_200k 1_car grad_prof_degree private 1_child 1_couple_with_children head house own non_hispanic asian us_citizen never 362_perc_annual_income immediate 0.5 yes one usa 453125.0 orbital
2 1 1 55906 1976 male h_100k_150k i_150k_200k 1_car grad_prof_degree private 1_child 1_couple_with_children head house own non_hispanic asian us_citizen never 3_perc_annual_income immediate 0.5 no more_than_one usa 3750.0 moon_trip
3 1 0 55906 1976 male h_100k_150k i_150k_200k 1_car grad_prof_degree private 1_child 1_couple_with_children head house own non_hispanic asian us_citizen never NaN NaN NaN NaN NaN NaN 0.0 not_travel
4 2 1 55906 1976 male h_100k_150k i_150k_200k 1_car grad_prof_degree private 1_child 1_couple_with_children head house own non_hispanic asian us_citizen never 362_perc_annual_income immediate 0.5 yes one usa 453125.0 suborbital
In [83]:
# Creating a column with 4 categories for each alternative

list_alt = [1, 2, 3, 4]
data['list_alt'] = np.tile(list_alt, len(data) // len(list_alt))
In [84]:
# Creating four dataframes with the different alternatives

data_suborbital = data[data['list_alt'] == 1]
data_orbital = data[data['list_alt'] == 2]
data_moon_trip = data[data['list_alt'] == 3]
data_not_travel = data[data['list_alt'] == 4]

data_suborbital = data_suborbital.rename(
    columns={
        'choice': 'suborbital',
        'price_attribute': 'price_attribute_suborbital',
        'availability': 'availability_suborbital',
        'probability_fatality': 'probability_fatality_suborbital',
        'training': 'training_suborbital',
        'number_passengers': 'number_passengers_suborbital',
        'takeoff_location': 'takeoff_location_suborbital',
        'price_dollars': 'price_dollars_suborbital',
    })
data_orbital = data_orbital.rename(
    columns={
        'choice': 'orbital',
        'price_attribute': 'price_attribute_orbital',
        'availability': 'availability_orbital',
        'probability_fatality': 'probability_fatality_orbital',
        'training': 'training_orbital',
        'number_passengers': 'number_passengers_orbital',
        'takeoff_location': 'takeoff_location_orbital',
        'price_dollars': 'price_dollars_orbital',
    })
data_moon_trip = data_moon_trip.rename(
    columns={
        'choice': 'moon_trip',
        'price_attribute': 'price_attribute_moon_trip',
        'availability': 'availability_moon_trip',
        'probability_fatality': 'probability_fatality_moon_trip',
        'training': 'training_moon_trip',
        'number_passengers': 'number_passengers_moon_trip',
        'takeoff_location': 'takeoff_location_moon_trip',
        'price_dollars': 'price_dollars_moon_trip',
    })
data_not_travel = data_not_travel.rename(
    columns={
        'choice': 'not_travel',
        'price_attribute': 'price_attribute_not_travel',
        'availability': 'availability_not_travel',
        'probability_fatality': 'probability_fatality_not_travel',
        'training': 'training_not_travel',
        'number_passengers': 'number_passengers_not_travel',
        'takeoff_location': 'takeoff_location_not_travel',
        'price_dollars': 'price_dollars_not_travel',
    })
In [85]:
# Merging the dataframes
merged_data = pd.merge(pd.merge(
    pd.merge(data_suborbital,
             data_orbital[[
                 'id', 'orbital', 'price_attribute_orbital',
                 'availability_orbital', 'probability_fatality_orbital',
                 'training_orbital', 'number_passengers_orbital',
                 'takeoff_location_orbital', 'price_dollars_orbital'
             ]],
             on='id',
             suffixes=('_suborbital', '_suborbital')),
    data_moon_trip[[
        'id', 'moon_trip', 'price_attribute_moon_trip',
        'availability_moon_trip', 'probability_fatality_moon_trip',
        'training_moon_trip', 'number_passengers_moon_trip',
        'takeoff_location_moon_trip', 'price_dollars_moon_trip'
    ]],
    on='id',
    suffixes=('', '_moon_trip')),
                       data_not_travel[['id', 'not_travel']],
                       on='id',
                       suffixes=('', '_not_travel'))
merged_data.head()

# Creating the column with the choice
merged_data['choice'] = merged_data[[
    'suborbital', 'orbital', 'moon_trip', 'not_travel'
]].idxmax(axis=1)

# Putting the choice in first place
choice_column = merged_data.pop('choice')
merged_data.insert(0, 'choice', choice_column)
In [86]:
# Changing year_birth for age
merged_data['age'] = 2022 - merged_data['year_birth']

# Transforming age to generation
merged_data['generation_age'] = np.where(
    merged_data['year_birth'] < 1928, 'greater_generation',
    np.where(
        merged_data['year_birth'] < 1946, 'traditionalist',
        np.where(
            merged_data['year_birth'] < 1964, 'baby_boomers',
            np.where(
                merged_data['year_birth'] < 1976, 'gen_x',
                np.where(merged_data['year_birth'] < 1995, 'millenials',
                         'centennials')))))

merged_data = merged_data.drop(columns=['year_birth'])
In [87]:
merged_data['generation_age'].value_counts().sort_index()
Out[87]:
baby_boomers     216
centennials      312
gen_x            492
millenials      1140
Name: generation_age, dtype: int64
In [88]:
def city(zip):
    try:
        engine = SearchEngine()
        return engine.by_zipcode(zip).major_city
    except:
        return np.nan


def state(zip):
    try:
        engine = SearchEngine()
        return engine.by_zipcode(zip).state
    except:
        return np.nan
In [89]:
# Assigning the zipcode to the city
engine = SearchEngine()
merged_data['city'] = merged_data['zip'].apply(city)
merged_data['state'] = merged_data['zip'].apply(state)
In [90]:
# Aggregating states using https://en.wikipedia.org/wiki/List_of_regions_of_the_United_States#/media/File:Census_Regions_and_Division_of_the_United_States.svg
dict_states = {
    'west': [
        'AZ', 'CA', 'CO', 'ID', 'MT', 'NV', 'NM', 'UT', 'WY', 'OR', 'WA', 'AK',
        'HI'
    ],
    'midwest':
    ['ND', 'SD', 'NE', 'KS', 'MN', 'IA', 'MO', 'WI', 'IL', 'IN', 'MI', 'OH'],
    'south': [
        'TX', 'LA', 'MS', 'AL', 'AR', 'OK', 'FL', 'GA', 'KY', 'NC', 'SC', 'TN',
        'VA', 'WV', 'DC', 'MD', 'DE'
    ],
    'northeast': ['NY', 'PA', 'NJ', 'CT', 'RI', 'MA', 'VT', 'NH', 'ME']
}


def argcontains(item):
    for i, v in dict_states.items():
        if item in v:
            return i
    return np.nan


merged_data['region'] = merged_data['state'].map(argcontains)
In [91]:
# Dropping the unnecessary columns
merged_data = merged_data.drop(columns=[
    'id', 'alternative', 'list_alt', 'suborbital', 'orbital', 'moon_trip',
    'not_travel'
])
In [92]:
merged_data.head()
Out[92]:
choice zip gender annual_income household_annual_income number_vehicles level_education work_type children_home household_type status_in_household type_residence housing_tenure_type origin race citizenship risk_activities_sports price_attribute_suborbital availability_suborbital probability_fatality_suborbital training_suborbital number_passengers_suborbital takeoff_location_suborbital price_dollars_suborbital price_attribute_orbital availability_orbital probability_fatality_orbital training_orbital number_passengers_orbital takeoff_location_orbital price_dollars_orbital price_attribute_moon_trip availability_moon_trip probability_fatality_moon_trip training_moon_trip number_passengers_moon_trip takeoff_location_moon_trip price_dollars_moon_trip age generation_age city state region
0 moon_trip 55906 male h_100k_150k i_150k_200k 1_car grad_prof_degree private 1_child 1_couple_with_children head house own non_hispanic asian us_citizen never 362_perc_annual_income immediate 7.5 no one usa 453125.0 362_perc_annual_income immediate 0.5 yes one usa 453125.0 3_perc_annual_income immediate 0.5 no more_than_one usa 3750.0 46 millenials Rochester MN midwest
1 suborbital 55906 male h_100k_150k i_150k_200k 1_car grad_prof_degree private 1_child 1_couple_with_children head house own non_hispanic asian us_citizen never 362_perc_annual_income immediate 0.5 yes one usa 453125.0 362_perc_annual_income immediate 0.5 yes more_than_one other 453125.0 362_perc_annual_income in_5_years 7.5 no one usa 453125.0 46 millenials Rochester MN midwest
2 moon_trip 55906 male h_100k_150k i_150k_200k 1_car grad_prof_degree private 1_child 1_couple_with_children head house own non_hispanic asian us_citizen never 50_perc_annual_income in_5_years 7.5 yes one usa 62500.0 362_perc_annual_income in_5_years 0.5 no one other 453125.0 50_perc_annual_income immediate 0.5 no more_than_one other 62500.0 46 millenials Rochester MN midwest
3 moon_trip 55906 male h_100k_150k i_150k_200k 1_car grad_prof_degree private 1_child 1_couple_with_children head house own non_hispanic asian us_citizen never 3_perc_annual_income in_5_years 7.5 no more_than_one usa 3750.0 50_perc_annual_income immediate 7.5 no one other 62500.0 50_perc_annual_income in_5_years 0.5 yes one usa 62500.0 46 millenials Rochester MN midwest
4 suborbital 55906 male h_100k_150k i_150k_200k 1_car grad_prof_degree private 1_child 1_couple_with_children head house own non_hispanic asian us_citizen never 50_perc_annual_income in_5_years 0.5 yes more_than_one usa 62500.0 50_perc_annual_income immediate 0.5 no more_than_one other 62500.0 3_perc_annual_income immediate 7.5 yes more_than_one usa 3750.0 46 millenials Rochester MN midwest
In [93]:
data = merged_data.copy()
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2160 entries, 0 to 2159
Data columns (total 43 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   choice                           2160 non-null   object 
 1   zip                              2160 non-null   int64  
 2   gender                           2160 non-null   object 
 3   annual_income                    2160 non-null   object 
 4   household_annual_income          2160 non-null   object 
 5   number_vehicles                  2160 non-null   object 
 6   level_education                  2160 non-null   object 
 7   work_type                        2160 non-null   object 
 8   children_home                    2160 non-null   object 
 9   household_type                   2160 non-null   object 
 10  status_in_household              2160 non-null   object 
 11  type_residence                   2160 non-null   object 
 12  housing_tenure_type              2160 non-null   object 
 13  origin                           2160 non-null   object 
 14  race                             2160 non-null   object 
 15  citizenship                      2160 non-null   object 
 16  risk_activities_sports           2160 non-null   object 
 17  price_attribute_suborbital       2160 non-null   object 
 18  availability_suborbital          2160 non-null   object 
 19  probability_fatality_suborbital  2160 non-null   float64
 20  training_suborbital              2160 non-null   object 
 21  number_passengers_suborbital     2160 non-null   object 
 22  takeoff_location_suborbital      2160 non-null   object 
 23  price_dollars_suborbital         2160 non-null   float64
 24  price_attribute_orbital          2160 non-null   object 
 25  availability_orbital             2160 non-null   object 
 26  probability_fatality_orbital     2160 non-null   float64
 27  training_orbital                 2160 non-null   object 
 28  number_passengers_orbital        2160 non-null   object 
 29  takeoff_location_orbital         2160 non-null   object 
 30  price_dollars_orbital            2160 non-null   float64
 31  price_attribute_moon_trip        2160 non-null   object 
 32  availability_moon_trip           2160 non-null   object 
 33  probability_fatality_moon_trip   2160 non-null   float64
 34  training_moon_trip               2160 non-null   object 
 35  number_passengers_moon_trip      2160 non-null   object 
 36  takeoff_location_moon_trip       2160 non-null   object 
 37  price_dollars_moon_trip          2160 non-null   float64
 38  age                              2160 non-null   int64  
 39  generation_age                   2160 non-null   object 
 40  city                             2076 non-null   object 
 41  state                            2076 non-null   object 
 42  region                           2076 non-null   object 
dtypes: float64(6), int64(2), object(35)
memory usage: 742.5+ KB
In [94]:
# The following zipcodes do not have information about city and state

data_without_region = data[data['region'].isnull()]
data_without_region['zip'].unique()
Out[94]:
array([10429, 70028, 19632, 53427, 54321])
In [95]:
data['region'].value_counts(normalize=True, dropna=False)
Out[95]:
west         0.361111
south        0.333333
northeast    0.166667
midwest      0.100000
NaN          0.038889
Name: region, dtype: float64

Not too imbalanced.

In [96]:
# Creating additional features because Rodrigo suggested me to do so

## probability_fatality
data['average_probability_fatality'] = data[[
    'probability_fatality_orbital', 'probability_fatality_suborbital',
    'probability_fatality_moon_trip'
]].mean(axis=1)
data['delta_probability_fatality_orbital'] = data[
    'average_probability_fatality'] - data['probability_fatality_orbital']
data['delta_probability_fatality_suborbital'] = data[
    'average_probability_fatality'] - data['probability_fatality_suborbital']
data['delta_probability_fatality_moon_trip'] = data[
    'average_probability_fatality'] - data['probability_fatality_moon_trip']

## price_attribute
data['average_price_dollars'] = data[[
    'price_dollars_orbital', 'price_dollars_suborbital',
    'price_dollars_moon_trip'
]].mean(axis=1)
data['delta_price_dollars_orbital'] = data['average_price_dollars'] - data[
    'price_dollars_orbital']
data['delta_price_dollars_suborbital'] = data['average_price_dollars'] - data[
    'price_dollars_suborbital']
data['delta_price_dollars_moon_trip'] = data['average_price_dollars'] - data[
    'price_dollars_moon_trip']

# Dropping features because Rodrigo suggested me to do so
data = data.drop(columns=[
    'zip', 'probability_fatality_orbital', 'probability_fatality_suborbital',
    'probability_fatality_moon_trip', 'price_dollars_orbital',
    'price_dollars_suborbital', 'price_dollars_moon_trip'
],
                 axis=1)

Quality analysis¶

In [97]:
print('Data shape: ', data.shape)
data.head()
Data shape:  (2160, 44)
Out[97]:
choice gender annual_income household_annual_income number_vehicles level_education work_type children_home household_type status_in_household type_residence housing_tenure_type origin race citizenship risk_activities_sports price_attribute_suborbital availability_suborbital training_suborbital number_passengers_suborbital takeoff_location_suborbital price_attribute_orbital availability_orbital training_orbital number_passengers_orbital takeoff_location_orbital price_attribute_moon_trip availability_moon_trip training_moon_trip number_passengers_moon_trip takeoff_location_moon_trip age generation_age city state region average_probability_fatality delta_probability_fatality_orbital delta_probability_fatality_suborbital delta_probability_fatality_moon_trip average_price_dollars delta_price_dollars_orbital delta_price_dollars_suborbital delta_price_dollars_moon_trip
0 moon_trip male h_100k_150k i_150k_200k 1_car grad_prof_degree private 1_child 1_couple_with_children head house own non_hispanic asian us_citizen never 362_perc_annual_income immediate no one usa 362_perc_annual_income immediate yes one usa 3_perc_annual_income immediate no more_than_one usa 46 millenials Rochester MN midwest 2.833333 2.333333 -4.666667 2.333333 303333.333333 -149791.666667 -149791.666667 299583.333333
1 suborbital male h_100k_150k i_150k_200k 1_car grad_prof_degree private 1_child 1_couple_with_children head house own non_hispanic asian us_citizen never 362_perc_annual_income immediate yes one usa 362_perc_annual_income immediate yes more_than_one other 362_perc_annual_income in_5_years no one usa 46 millenials Rochester MN midwest 2.833333 2.333333 2.333333 -4.666667 453125.000000 0.000000 0.000000 0.000000
2 moon_trip male h_100k_150k i_150k_200k 1_car grad_prof_degree private 1_child 1_couple_with_children head house own non_hispanic asian us_citizen never 50_perc_annual_income in_5_years yes one usa 362_perc_annual_income in_5_years no one other 50_perc_annual_income immediate no more_than_one other 46 millenials Rochester MN midwest 2.833333 2.333333 -4.666667 2.333333 192708.333333 -260416.666667 130208.333333 130208.333333
3 moon_trip male h_100k_150k i_150k_200k 1_car grad_prof_degree private 1_child 1_couple_with_children head house own non_hispanic asian us_citizen never 3_perc_annual_income in_5_years no more_than_one usa 50_perc_annual_income immediate no one other 50_perc_annual_income in_5_years yes one usa 46 millenials Rochester MN midwest 5.166667 -2.333333 -2.333333 4.666667 42916.666667 -19583.333333 39166.666667 -19583.333333
4 suborbital male h_100k_150k i_150k_200k 1_car grad_prof_degree private 1_child 1_couple_with_children head house own non_hispanic asian us_citizen never 50_perc_annual_income in_5_years yes more_than_one usa 50_perc_annual_income immediate no more_than_one other 3_perc_annual_income immediate yes more_than_one usa 46 millenials Rochester MN midwest 2.833333 2.333333 2.333333 -4.666667 42916.666667 -19583.333333 -19583.333333 39166.666667

Age¶

In [98]:
# Analyzing status_in_household per age
data[['status_in_household',
      'age']][data['status_in_household'] == 'child'].describe()
Out[98]:
age
count 108.000000
mean 42.888889
std 16.422764
min 21.000000
25% 28.000000
50% 41.000000
75% 57.000000
max 67.000000
In [99]:
# Analyzing status_in_household per age
data[['status_in_household',
      'age']].groupby('status_in_household').mean().plot(kind='bar',
                                                         figsize=(20, 5),
                                                         fontsize=20)
Out[99]:
<AxesSubplot:xlabel='status_in_household'>
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif

Annual Income¶

In [100]:
# Analyzing annual_income against household_annual_income
income = pd.crosstab(data['annual_income'], data['household_annual_income'])

a = income.plot(kind='bar', rot=0, figsize=(30, 10), fontsize=15)
a.legend(bbox_to_anchor=(1, 1.02), loc='best', fontsize=20)
a.set_xlabel('annual_income', fontsize=20)
a.set_ylabel('household_annual_income', fontsize=20)
Out[100]:
Text(0, 0.5, 'household_annual_income')

household_type vs. children_home¶

In [101]:
# Analyzing household_type against children_home
children_home = pd.crosstab(data['household_type'], data['children_home'])

a = children_home.plot(kind='bar', rot=0, figsize=(30, 10), fontsize=15)
a.legend(bbox_to_anchor=(1, 1.02), loc='best', fontsize=20)
a.set_xlabel('household_type', fontsize=20)
a.set_ylabel('children_home', fontsize=20)
Out[101]:
Text(0, 0.5, 'children_home')

Removing akward outliers¶

In [102]:
data_filtered = data.copy()
print("Shape before filtering: ", data_filtered.shape)

# Removing rows where house_annual_income is lower than annual_income
data_filtered = data_filtered.drop(data_filtered[
    (data_filtered['annual_income'] == 'b_10k_15k')
    & (data_filtered['household_annual_income'] == 'a_less_than_10k')].index)
data_filtered = data_filtered.drop(
    data_filtered[(data_filtered['annual_income'] == 'c_15k_25k') & (
        (data_filtered['household_annual_income'] == 'a_less_than_10k')
        | (data_filtered['household_annual_income'] == 'b_10k_15k'))].index)
data_filtered = data_filtered.drop(
    data_filtered[(data_filtered['annual_income'] == 'd_25k_35k') & (
        (data_filtered['household_annual_income'] == 'a_less_than_10k')
        | (data_filtered['household_annual_income'] == 'b_10k_15k')
        | (data_filtered['household_annual_income'] == 'c_15k_25k'))].index)
data_filtered = data_filtered.drop(
    data_filtered[(data_filtered['annual_income'] == 'e_35k_50k') & (
        (data_filtered['household_annual_income'] == 'a_less_than_10k')
        | (data_filtered['household_annual_income'] == 'b_10k_15k')
        | (data_filtered['household_annual_income'] == 'c_15k_25k')
        | (data_filtered['household_annual_income'] == 'd_25k_35k'))].index)
data_filtered = data_filtered.drop(
    data_filtered[(data_filtered['annual_income'] == 'f_50k_75k') & (
        (data_filtered['household_annual_income'] == 'a_less_than_10k')
        | (data_filtered['household_annual_income'] == 'b_10k_15k')
        | (data_filtered['household_annual_income'] == 'c_15k_25k')
        | (data_filtered['household_annual_income'] == 'd_25k_35k')
        | (data_filtered['household_annual_income'] == 'e_35k_50k'))].index)
data_filtered = data_filtered.drop(
    data_filtered[(data_filtered['annual_income'] == 'g_75k_100k') & (
        (data_filtered['household_annual_income'] == 'a_less_than_10k')
        | (data_filtered['household_annual_income'] == 'b_10k_15k')
        | (data_filtered['household_annual_income'] == 'c_15k_25k')
        | (data_filtered['household_annual_income'] == 'd_25k_35k')
        | (data_filtered['household_annual_income'] == 'e_35k_50k')
        | (data_filtered['household_annual_income'] == 'f_50k_75k'))].index)
data_filtered = data_filtered.drop(
    data_filtered[(data_filtered['annual_income'] == 'h_100k_150k') & (
        (data_filtered['household_annual_income'] == 'a_less_than_10k')
        | (data_filtered['household_annual_income'] == 'b_10k_15k')
        | (data_filtered['household_annual_income'] == 'c_15k_25k')
        | (data_filtered['household_annual_income'] == 'd_25k_35k')
        | (data_filtered['household_annual_income'] == 'e_35k_50k')
        | (data_filtered['household_annual_income'] == 'f_50k_75k')
        | (data_filtered['household_annual_income'] == 'g_75k_100k'))].index)
data_filtered = data_filtered.drop(
    data_filtered[(data_filtered['annual_income'] == 'i_150k_200k') & (
        (data_filtered['household_annual_income'] == 'a_less_than_10k')
        | (data_filtered['household_annual_income'] == 'b_10k_15k')
        | (data_filtered['household_annual_income'] == 'c_15k_25k')
        | (data_filtered['household_annual_income'] == 'd_25k_35k')
        | (data_filtered['household_annual_income'] == 'e_35k_50k')
        | (data_filtered['household_annual_income'] == 'f_50k_75k')
        | (data_filtered['household_annual_income'] == 'g_75k_100k')
        | (data_filtered['household_annual_income'] == 'h_100k_150k'))].index)
data_filtered = data_filtered.drop(
    data_filtered[(data_filtered['annual_income'] == 'j_more_than_200k') & (
        (data_filtered['household_annual_income'] == 'a_less_than_10k')
        | (data_filtered['household_annual_income'] == 'b_10k_15k')
        | (data_filtered['household_annual_income'] == 'c_15k_25k')
        | (data_filtered['household_annual_income'] == 'd_25k_35k')
        | (data_filtered['household_annual_income'] == 'e_35k_50k')
        | (data_filtered['household_annual_income'] == 'f_50k_75k')
        | (data_filtered['household_annual_income'] == 'g_75k_100k')
        | (data_filtered['household_annual_income'] == 'h_100k_150k')
        | (data_filtered['household_annual_income'] == 'i_150k_200k'))].index)

# Removing rows where household_type and children_home do not coincide
data_filtered = data_filtered.drop(
    data_filtered[(data_filtered['household_type'] == '0_alone') & (
        (data_filtered['children_home'] == '1_child')
        | (data_filtered['children_home'] == '2_children')
        | (data_filtered['children_home'] == '3_children')
        | (data_filtered['children_home'] == '4_children')
        | (data_filtered['children_home'] == '5_children_or_more'))].index)

data_filtered = data_filtered.drop(data_filtered[
    (data_filtered['household_type'] == '0_couple_no_children')
    & ((data_filtered['children_home'] == '1_child')
       | (data_filtered['children_home'] == '2_children')
       | (data_filtered['children_home'] == '3_children')
       | (data_filtered['children_home'] == '4_children')
       | (data_filtered['children_home'] == '5_children_or_more'))].index)
data_filtered = data_filtered.drop(data_filtered[
    (data_filtered['household_type'] == '0_female_no_children')
    & ((data_filtered['children_home'] == '1_child')
       | (data_filtered['children_home'] == '2_children')
       | (data_filtered['children_home'] == '3_children')
       | (data_filtered['children_home'] == '4_children')
       | (data_filtered['children_home'] == '5_children_or_more'))].index)
data_filtered = data_filtered.drop(
    data_filtered[(data_filtered['household_type'] == '0_male_no_children') & (
        (data_filtered['children_home'] == '1_child')
        | (data_filtered['children_home'] == '2_children')
        | (data_filtered['children_home'] == '3_children')
        | (data_filtered['children_home'] == '4_children')
        | (data_filtered['children_home'] == '5_children_or_more'))].index)
data_filtered = data_filtered.drop(
    data_filtered[(data_filtered['household_type'] == '1_couple_with_children')
                  & (data_filtered['children_home'] == '0_children')].index)
data_filtered = data_filtered.drop(
    data_filtered[(data_filtered['household_type'] == '1_female_children')
                  & (data_filtered['children_home'] == '0_children')].index)
data_filtered = data_filtered.drop(
    data_filtered[(data_filtered['household_type'] == '1_male_children')
                  & (data_filtered['children_home'] == '0_children')].index)

# status_in_household to be excluded in the model

print("Shape after filtering: ", data_filtered.shape)
Shape before filtering:  (2160, 44)
Shape after filtering:  (1908, 44)
In [103]:
data_filtered.describe()
Out[103]:
age average_probability_fatality delta_probability_fatality_orbital delta_probability_fatality_suborbital delta_probability_fatality_moon_trip average_price_dollars delta_price_dollars_orbital delta_price_dollars_suborbital delta_price_dollars_moon_trip
count 1908.000000 1908.000000 1908.000000 1908.000000 1908.000000 1908.000000 1908.000000 1908.000000 1908.000000
mean 41.201258 4.011006 -0.022013 0.011006 0.011006 65917.531447 306.132075 -153.066038 -153.066038
std 12.005651 1.782546 2.694918 3.391547 2.910430 75154.656241 71495.911836 80245.691536 83181.737163
min 21.000000 0.500000 -4.666667 -4.666667 -4.666667 150.000000 -520833.333333 -599166.666667 -599166.666667
25% 31.000000 2.833333 -2.333333 -2.333333 -2.333333 14591.666667 -20970.833333 -20970.833333 -20970.833333
50% 38.000000 5.166667 0.000000 0.000000 0.000000 42466.666667 -391.666667 4700.000000 0.000000
75% 51.000000 5.166667 2.333333 2.333333 2.333333 96354.166667 35950.000000 37612.500000 36458.333333
max 72.000000 7.500000 4.666667 4.666667 4.666667 906250.000000 419416.666667 419416.666667 599166.666667
In [104]:
# Analyzing annual_income against household_annual_income after filtering

income_2 = pd.crosstab(data_filtered['annual_income'],
                       data_filtered['household_annual_income'])

a = income_2.plot(kind='bar', rot=0, figsize=(30, 10), fontsize=15)
a.legend(bbox_to_anchor=(1, 1.02), loc='best', fontsize=20)
a.set_xlabel('annual_income', fontsize=20)
a.set_ylabel('household_annual_income', fontsize=20)
Out[104]:
Text(0, 0.5, 'household_annual_income')
In [105]:
# Analyzing household_type against children_home after filtering
children_home_2 = pd.crosstab(data_filtered['household_type'],
                              data_filtered['children_home'])

a = children_home_2.plot(kind='bar', rot=0, figsize=(30, 10), fontsize=15)
a.legend(bbox_to_anchor=(1, 1.02), loc='best', fontsize=20)
a.set_xlabel('household_type', fontsize=20)
a.set_ylabel('children_home', fontsize=20)
Out[105]:
Text(0, 0.5, 'children_home')

Statistics final dataset¶

In [108]:
profile = ProfileReport(data_filtered, title='Report')
profile
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[108]:

In [109]:
profile.to_file(base_dir + 'data/modelingData_SpaceTourism_strings_v4_Report.html')
Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Saving processed data¶

In [107]:
data_filtered.to_csv(base_dir +
                     'data/modelingData_SpaceTourism_strings_v4.csv',
                     index=False)

2. Modeling¶

In [3]:
data = pd.read_csv(base_dir + 'data/modelingData_SpaceTourism_strings_v4.csv')
In [4]:
data.head()
Out[4]:
choice gender annual_income household_annual_income number_vehicles level_education work_type children_home household_type status_in_household type_residence housing_tenure_type origin race citizenship risk_activities_sports price_attribute_suborbital availability_suborbital training_suborbital number_passengers_suborbital takeoff_location_suborbital price_attribute_orbital availability_orbital training_orbital number_passengers_orbital takeoff_location_orbital price_attribute_moon_trip availability_moon_trip training_moon_trip number_passengers_moon_trip takeoff_location_moon_trip age generation_age city state region average_probability_fatality delta_probability_fatality_orbital delta_probability_fatality_suborbital delta_probability_fatality_moon_trip average_price_dollars delta_price_dollars_orbital delta_price_dollars_suborbital delta_price_dollars_moon_trip
0 moon_trip male h_100k_150k i_150k_200k 1_car grad_prof_degree private 1_child 1_couple_with_children head house own non_hispanic asian us_citizen never 362_perc_annual_income immediate no one usa 362_perc_annual_income immediate yes one usa 3_perc_annual_income immediate no more_than_one usa 46 millenials Rochester MN midwest 2.833333 2.333333 -4.666667 2.333333 303333.333333 -149791.666667 -149791.666667 299583.333333
1 suborbital male h_100k_150k i_150k_200k 1_car grad_prof_degree private 1_child 1_couple_with_children head house own non_hispanic asian us_citizen never 362_perc_annual_income immediate yes one usa 362_perc_annual_income immediate yes more_than_one other 362_perc_annual_income in_5_years no one usa 46 millenials Rochester MN midwest 2.833333 2.333333 2.333333 -4.666667 453125.000000 0.000000 0.000000 0.000000
2 moon_trip male h_100k_150k i_150k_200k 1_car grad_prof_degree private 1_child 1_couple_with_children head house own non_hispanic asian us_citizen never 50_perc_annual_income in_5_years yes one usa 362_perc_annual_income in_5_years no one other 50_perc_annual_income immediate no more_than_one other 46 millenials Rochester MN midwest 2.833333 2.333333 -4.666667 2.333333 192708.333333 -260416.666667 130208.333333 130208.333333
3 moon_trip male h_100k_150k i_150k_200k 1_car grad_prof_degree private 1_child 1_couple_with_children head house own non_hispanic asian us_citizen never 3_perc_annual_income in_5_years no more_than_one usa 50_perc_annual_income immediate no one other 50_perc_annual_income in_5_years yes one usa 46 millenials Rochester MN midwest 5.166667 -2.333333 -2.333333 4.666667 42916.666667 -19583.333333 39166.666667 -19583.333333
4 suborbital male h_100k_150k i_150k_200k 1_car grad_prof_degree private 1_child 1_couple_with_children head house own non_hispanic asian us_citizen never 50_perc_annual_income in_5_years yes more_than_one usa 50_perc_annual_income immediate no more_than_one other 3_perc_annual_income immediate yes more_than_one usa 46 millenials Rochester MN midwest 2.833333 2.333333 2.333333 -4.666667 42916.666667 -19583.333333 -19583.333333 39166.666667
In [5]:
# Looking different levels of categorical features

for i in data[[
        'number_vehicles', 'level_education', 'children_home',
        'risk_activities_sports', 'generation_age'
]].columns:
    print(i, '\n', data[i].value_counts(normalize=True))
    print('---------------------------------\n')
number_vehicles 2_cars            0.522013
1_car             0.371069
3_cars            0.062893
4_or_more_cars    0.044025
Name: number_vehicles, dtype: float64
---------------------------------

level_education bachelor_degree         0.616352
grad_prof_degree        0.150943
some_college            0.125786
high_school_graduate    0.069182
associate_degree        0.037736
Name: level_education, dtype: float64
---------------------------------

children_home 0_children            0.377358
2_children            0.308176
1_child               0.264151
4_children            0.018868
3_children            0.018868
5_children_or_more    0.012579
Name: children_home, dtype: float64
---------------------------------

risk_activities_sports rarely    0.459119
never     0.358491
often     0.182390
Name: risk_activities_sports, dtype: float64
---------------------------------

generation_age millenials      0.547170
gen_x           0.245283
centennials     0.119497
baby_boomers    0.088050
Name: generation_age, dtype: float64
---------------------------------

In [6]:
from pycaret.classification import *

exp1 = setup(
    data=data,
    target='choice',
    session_id=42,
    normalize=True,
    #normalize_method='minmax',
    #transformation=True,
    #pca = True,
    ignore_features=['city', 'state', 'generation_age', 'status_in_household'],
    ordinal_features={
        'annual_income': [
            'a_less_than_10k', 'b_10k_15k', 'c_15k_25k', 'd_25k_35k',
            'e_35k_50k', 'f_50k_75k', 'g_75k_100k', 'h_100k_150k',
            'i_150k_200k', 'j_more_than_200k'
        ],
        'household_annual_income': [
            'a_less_than_10k', 'b_10k_15k', 'c_15k_25k', 'd_25k_35k',
            'e_35k_50k', 'f_50k_75k', 'g_75k_100k', 'h_100k_150k',
            'i_150k_200k', 'j_more_than_200k'
        ],
        'number_vehicles': ['1_car', '2_cars', '3_cars', '4_or_more_cars'],
        'level_education': [
            # 'less_9th_grade', '9th_12th_grade_nodiploma', # Nobody in those ranges
            'high_school_graduate',
            'some_college',
            'associate_degree',
            'bachelor_degree',
            'grad_prof_degree'
        ],
        'children_home': [
            '0_children', '1_child', '2_children', '3_children', '4_children',
            '5_children_or_more'
        ],
        'risk_activities_sports': ['never', 'rarely', 'often'],
        # 'generation_age':
        # ['baby_boomers', 'gen_x', 'millenials', 'centennials']
    },
    train_size=0.8,
    use_gpu=True,
    combine_rare_levels=True,  # Added
    remove_multicollinearity=True,  # Added
    unknown_categorical_method='most_frequent',  # Added
    remove_outliers=True,  # Added
    fix_imbalance=True,
    data_split_stratify=True,
    fold_strategy='stratifiedkfold',
    silent=True,
    log_experiment=True,
    experiment_name='first_exp',
)
  Description Value
0 session_id 42
1 Target choice
2 Target Type Multiclass
3 Label Encoded moon_trip: 0, not_travel: 1, orbital: 2, suborbital: 3
4 Original Data (1908, 44)
5 Missing Values True
6 Numeric Features 9
7 Categorical Features 30
8 Ordinal Features True
9 High Cardinality Features False
10 High Cardinality Method None
11 Transformed Train Set (1449, 62)
12 Transformed Test Set (382, 62)
13 Shuffle Train-Test True
14 Stratify Train-Test True
15 Fold Generator StratifiedKFold
16 Fold Number 10
17 CPU Jobs -1
18 Use GPU True
19 Log Experiment True
20 Experiment Name first_exp
21 USI 2565
22 Imputation Type simple
23 Iterative Imputation Iteration None
24 Numeric Imputer mean
25 Iterative Imputation Numeric Model None
26 Categorical Imputer constant
27 Iterative Imputation Categorical Model None
28 Unknown Categoricals Handling most_frequent
29 Normalize True
30 Normalize Method zscore
31 Transformation False
32 Transformation Method None
33 PCA False
34 PCA Method None
35 PCA Components None
36 Ignore Low Variance False
37 Combine Rare Levels True
38 Rare Level Threshold 0.100000
39 Numeric Binning False
40 Remove Outliers True
41 Outliers Threshold 0.050000
42 Remove Multicollinearity True
43 Multicollinearity Threshold 0.900000
44 Remove Perfect Collinearity True
45 Clustering False
46 Clustering Iteration None
47 Polynomial Features False
48 Polynomial Degree None
49 Trignometry Features False
50 Polynomial Threshold None
51 Group Features False
52 Feature Selection False
53 Feature Selection Method classic
54 Features Selection Threshold None
55 Feature Interaction False
56 Feature Ratio False
57 Interaction Threshold None
58 Fix Imbalance True
59 Fix Imbalance Method SMOTE
In [61]:
best_models = compare_models(turbo=False, sort='auc', round=3)
  Model Accuracy AUC Recall Prec. F1 Kappa MCC TT (Sec)
gbc Gradient Boosting Classifier 0.540 0.788 0.539 0.541 0.538 0.383 0.384 1.861
lightgbm Light Gradient Boosting Machine 0.539 0.787 0.537 0.541 0.538 0.382 0.383 16.177
catboost CatBoost Classifier 0.524 0.777 0.523 0.525 0.522 0.364 0.365 8.267
mlp MLP Classifier 0.508 0.753 0.507 0.513 0.508 0.342 0.343 54.289
rbfsvm SVM - Radial Kernel 0.481 0.739 0.480 0.483 0.480 0.306 0.307 1.281
ada Ada Boost Classifier 0.472 0.731 0.476 0.480 0.471 0.297 0.299 0.263
lr Logistic Regression 0.483 0.729 0.484 0.485 0.481 0.310 0.311 13.012
lda Linear Discriminant Analysis 0.475 0.729 0.476 0.477 0.474 0.300 0.301 0.275
rf Random Forest Classifier 0.461 0.726 0.458 0.462 0.459 0.278 0.279 0.903
et Extra Trees Classifier 0.433 0.701 0.429 0.432 0.431 0.241 0.241 1.431
qda Quadratic Discriminant Analysis 0.319 0.681 0.285 0.268 0.189 0.049 0.116 0.338
nb Naive Bayes 0.300 0.647 0.307 0.390 0.204 0.077 0.144 0.092
knn K Neighbors Classifier 0.375 0.627 0.378 0.385 0.371 0.169 0.172 0.480
dt Decision Tree Classifier 0.431 0.620 0.430 0.435 0.430 0.240 0.241 0.076
gpc Gaussian Process Classifier 0.366 0.588 0.365 0.370 0.367 0.154 0.154 18.088
dummy Dummy Classifier 0.258 0.500 0.250 0.067 0.106 0.000 0.000 0.113
svm SVM - Linear Kernel 0.415 0.000 0.413 0.445 0.396 0.218 0.228 0.118
ridge Ridge Classifier 0.481 0.000 0.482 0.482 0.478 0.308 0.309 0.167
In [ ]:
 

lightGBM¶

In [63]:
# Best ML analysis
lightgbm = create_model(estimator='lightgbm', fit_kwargs={'verbose': -1})
  Accuracy AUC Recall Prec. F1 Kappa MCC
Fold              
0 0.5724 0.8014 0.5724 0.5759 0.5739 0.4277 0.4278
1 0.5448 0.7721 0.5445 0.5446 0.5437 0.3896 0.3901
2 0.5724 0.8090 0.5707 0.5734 0.5702 0.4264 0.4281
3 0.4552 0.7338 0.4547 0.4466 0.4483 0.2702 0.2715
4 0.5379 0.7648 0.5344 0.5356 0.5354 0.3808 0.3815
5 0.5517 0.8131 0.5427 0.5466 0.5477 0.3990 0.3998
6 0.4966 0.7705 0.4905 0.5074 0.4996 0.3244 0.3255
7 0.6000 0.8264 0.5945 0.6092 0.6018 0.4628 0.4640
8 0.5241 0.7859 0.5291 0.5251 0.5217 0.3629 0.3645
9 0.5347 0.7884 0.5339 0.5433 0.5347 0.3786 0.3806
Mean 0.5390 0.7865 0.5367 0.5408 0.5377 0.3822 0.3833
Std 0.0390 0.0260 0.0385 0.0416 0.0404 0.0521 0.0520
In [9]:
# Plotting the classification report
plot_model(estimator=lightgbm, plot='class_report', use_train_data=False)
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif
In [10]:
plot_model(lightgbm,
           plot='confusion_matrix',
           plot_kwargs={'percent': True},
           use_train_data=False)
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif
In [12]:
# Plotting the AUC
plot_model(estimator=lightgbm, plot='auc', use_train_data=False)
In [16]:
plot_model(estimator=lightgbm, plot='error')
In [17]:
interpret_model(estimator=lightgbm)

# ValueError: type parameter only accepts 'summary', 'correlation', 'reason', 'pdp', 'msa' or 'pfi'.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif

Keep in mind the label codes:¶

  • moon_trip: 0,
  • not_travel: 1,
  • orbital: 2,
  • suborbital: 3
In [ ]:
 

Tuned lightgbm¶

In [66]:
# tune hyperparameters to optimize AUC
tuned_lightgbm = tune_model(lightgbm,
                            optimize='AUC',
                            n_iter=80,
                            fit_kwargs={'verbose': -1},
                            early_stopping=True,
                            choose_better=True)
  Accuracy AUC Recall Prec. F1 Kappa MCC
Fold              
0 0.5655 0.8056 0.5681 0.5801 0.5704 0.4191 0.4201
1 0.5379 0.7654 0.5356 0.5393 0.5382 0.3809 0.3811
2 0.5517 0.7944 0.5536 0.5514 0.5503 0.3995 0.4002
3 0.5034 0.7561 0.5037 0.5064 0.5033 0.3343 0.3351
4 0.5241 0.7708 0.5217 0.5200 0.5207 0.3626 0.3632
5 0.5655 0.8113 0.5545 0.5527 0.5565 0.4164 0.4179
6 0.5793 0.7983 0.5782 0.5825 0.5797 0.4350 0.4356
7 0.6000 0.8320 0.5966 0.6014 0.5994 0.4631 0.4638
8 0.5034 0.7711 0.5055 0.4978 0.4972 0.3365 0.3381
9 0.5694 0.7907 0.5705 0.5864 0.5713 0.4258 0.4292
Mean 0.5500 0.7896 0.5488 0.5518 0.5487 0.3973 0.3984
Std 0.0306 0.0224 0.0298 0.0340 0.0319 0.0407 0.0408
In [20]:
# Plotting the classification report
plot_model(estimator=tuned_lightgbm, plot='class_report', use_train_data=False)
In [21]:
plot_model(tuned_lightgbm,
           plot='confusion_matrix',
           plot_kwargs={'percent': True},
           use_train_data=False)
In [23]:
# Plotting the AUC
plot_model(estimator=tuned_lightgbm, plot='auc', use_train_data=False)
In [27]:
plot_model(estimator=tuned_lightgbm, plot='error')
In [28]:
interpret_model(estimator=tuned_lightgbm)

# ValueError: type parameter only accepts 'summary', 'correlation', 'reason', 'pdp', 'msa' or 'pfi'.

Keep in mind the label codes:¶

  • moon_trip: 0,
  • not_travel: 1,
  • orbital: 2,
  • suborbital: 3
In [ ]:
 

Gradient Boosting Classifier¶

In [30]:
# Best ML analysis
gbc = create_model(estimator='gbc')
  Accuracy AUC Recall Prec. F1 Kappa MCC
Fold              
0 0.5862 0.8132 0.5880 0.5917 0.5882 0.4457 0.4461
1 0.5379 0.7643 0.5376 0.5392 0.5373 0.3808 0.3815
2 0.6069 0.8162 0.6090 0.6139 0.6097 0.4749 0.4753
3 0.5034 0.7625 0.4971 0.4924 0.4934 0.3329 0.3354
4 0.5172 0.7778 0.5174 0.5099 0.5119 0.3539 0.3547
5 0.5586 0.8120 0.5489 0.5500 0.5529 0.4076 0.4085
6 0.5034 0.7691 0.5021 0.5077 0.5040 0.3335 0.3343
7 0.6069 0.8227 0.6008 0.6145 0.6049 0.4709 0.4735
8 0.4966 0.7615 0.5027 0.4924 0.4929 0.3268 0.3275
9 0.4792 0.7779 0.4816 0.5014 0.4821 0.3048 0.3078
Mean 0.5396 0.7877 0.5385 0.5413 0.5377 0.3832 0.3844
Std 0.0450 0.0238 0.0441 0.0466 0.0462 0.0598 0.0596
In [31]:
# Plotting the classification report
plot_model(estimator=gbc, plot='class_report', use_train_data=False)
In [32]:
plot_model(gbc,
           plot='confusion_matrix',
           plot_kwargs={'percent': True},
           use_train_data=False)
In [34]:
# Plotting the AUC
plot_model(estimator=gbc, plot='auc', use_train_data=False)
In [38]:
plot_model(estimator=gbc, plot='error')
In [39]:
interpret_model(estimator=gbc, plot='msa')

# ValueError: type parameter only accepts 'summary', 'correlation', 'reason', 'pdp', 'msa' or 'pfi'.

Tuned Gradient Boosting Classifier¶

In [41]:
# tune hyperparameters to optimize AUC
tuned_gbc = tune_model(gbc,
                       optimize='AUC',
                       n_iter=80,
                       early_stopping=True,
                       choose_better=True)
  Accuracy AUC Recall Prec. F1 Kappa MCC
Fold              
0 0.5379 0.7934 0.5391 0.5409 0.5385 0.3815 0.3820
1 0.5517 0.7676 0.5501 0.5531 0.5501 0.3992 0.4006
2 0.5724 0.8057 0.5749 0.5690 0.5704 0.4281 0.4282
3 0.4759 0.7324 0.4777 0.4783 0.4748 0.3004 0.3015
4 0.5034 0.7603 0.5050 0.5054 0.5019 0.3368 0.3379
5 0.5931 0.8335 0.5849 0.5864 0.5891 0.4540 0.4544
6 0.5172 0.7782 0.5181 0.5309 0.5210 0.3525 0.3538
7 0.6000 0.8278 0.5914 0.5986 0.5973 0.4626 0.4637
8 0.5172 0.7814 0.5184 0.5207 0.5102 0.3535 0.3582
9 0.5486 0.7946 0.5494 0.5646 0.5521 0.3967 0.3984
Mean 0.5418 0.7875 0.5409 0.5448 0.5405 0.3865 0.3879
Std 0.0376 0.0291 0.0348 0.0353 0.0372 0.0495 0.0491
In [42]:
# Plotting the classification report
plot_model(estimator=tuned_gbc, plot='class_report', use_train_data=False)
In [43]:
plot_model(tuned_gbc,
           plot='confusion_matrix',
           plot_kwargs={'percent': True},
           use_train_data=False)
In [45]:
# Plotting the AUC
plot_model(estimator=tuned_gbc, plot='auc', use_train_data=False)
In [49]:
plot_model(estimator=tuned_gbc, plot='error')
In [50]:
interpret_model(estimator=tuned_gbc, plot='msa')

# ValueError: type parameter only accepts 'summary', 'correlation', 'reason', 'pdp', 'msa' or 'pfi'.

Finalizing and saving model¶

In [67]:
# finalize a model
final_lightgbm = finalize_model(lightgbm)
final_tuned_lightgbm = finalize_model(tuned_lightgbm)
final_gbc = finalize_model(gbc)
final_tuned_gbc = finalize_model(tuned_gbc)
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
In [68]:
# save a model
save_model(final_lightgbm, base_dir + 'models/lightgbm_jupyter')
save_model(final_tuned_lightgbm, base_dir + 'models/tuned_lightgbm')
save_model(final_gbc, base_dir + 'models/gbc_jupyter')
save_model(final_tuned_gbc, base_dir + 'models/tuned_gbc_jupyter')
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Out[68]:
(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=False,
                                       features_todrop=['city', 'state',
                                                        'generation_age',
                                                        'status_in_household'],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[], target='choice',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_ca...
                                             learning_rate=0.1, loss='deviance',
                                             max_depth=3, max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100,
                                             n_iter_no_change=None,
                                             presort='deprecated',
                                             random_state=42, subsample=1.0,
                                             tol=0.0001, validation_fraction=0.1,
                                             verbose=0, warm_start=False)]],
          verbose=False),
 '../models/tuned_gbc_jupyter.pkl')
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 

Rodrigo, esto es una prueba, no le pongas mucho cuidado por ahora, es como para ver la significancia del modelo frente a otros, pero realmente tanto el gbc como el lightgbm son muy similares..... solo que el lightgbm me deja presentar los resultados de impacto de variables mejor que el gbc...¶

3. Hypothesis testing¶

In [69]:
tuned_lightgbm
Out[69]:
LGBMClassifier(bagging_fraction=0.8, bagging_freq=0, boosting_type='gbdt',
               class_weight=None, colsample_bytree=1.0, feature_fraction=0.8,
               importance_type='split', learning_rate=0.05, max_depth=-1,
               min_child_samples=31, min_child_weight=0.001, min_split_gain=0.3,
               n_estimators=130, n_jobs=-1, num_leaves=80, objective=None,
               random_state=42, reg_alpha=2, reg_lambda=0.4, silent='warn',
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
In [70]:
X = get_config('X')
y = get_config('y')
cv1 = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
In [71]:
labels = ['gbc', 'tuned_gbc', 'lightgbm', 'tuned_lightgbm']
models = [gbc, tuned_gbc, lightgbm, tuned_lightgbm]
In [72]:
for i, j in zip(labels, models):
    score = cross_val_score(j, X, y, scoring='accuracy', cv=cv1, n_jobs=-1)
    print(i + '_score: %.2f%% +/-(%.3f)' %
          (np.mean(score * 100), np.std(score)))
gbc_score: 53.51% +/-(0.036)
tuned_gbc_score: 53.51% +/-(0.036)
lightgbm_score: 55.90% +/-(0.027)
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8

[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0

[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
tuned_lightgbm_score: 54.52% +/-(0.032)
In [73]:
for i, j in zip(labels, models):
    score = cross_val_score(j,
                            X,
                            y,
                            scoring='roc_auc_ovr_weighted',
                            cv=cv1,
                            n_jobs=-1)
    print(i + '_score: %.2f%% +/-(%.3f)' %
          (np.mean(score * 100), np.std(score)))
gbc_score: 78.58% +/-(0.021)
tuned_gbc_score: 78.58% +/-(0.021)
lightgbm_score: 79.94% +/-(0.019)
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0

[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
tuned_lightgbm_score: 79.22% +/-(0.023)
In [74]:
for i, j in zip(labels, models):
    score = cross_val_score(j,
                            X,
                            y,
                            scoring='roc_auc_ovo_weighted',
                            cv=cv1,
                            n_jobs=-1)
    print(i + '_score: %.2f%% +/-(%.3f)' %
          (np.mean(score * 100), np.std(score)))
gbc_score: 78.77% +/-(0.021)
tuned_gbc_score: 78.77% +/-(0.021)
lightgbm_score: 80.12% +/-(0.019)
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
tuned_lightgbm_score: 79.40% +/-(0.022)
In [75]:
for i, j in zip(labels, models):
    score = cross_val_score(j, X, y, scoring='f1_weighted', cv=cv1, n_jobs=-1)
    print(i + '_score: %.2f%% +/-(%.3f)' %
          (np.mean(score * 100), np.std(score)))
gbc_score: 53.33% +/-(0.035)
tuned_gbc_score: 53.33% +/-(0.035)
lightgbm_score: 55.84% +/-(0.026)
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0

[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_freq is set=0, subsample_freq=0 will be ignored. Current value: bagging_freq=0
tuned_lightgbm_score: 54.21% +/-(0.030)
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [77]:
!jupyter nbconvert --to html eda_v2.ipynb  
[NbConvertApp] Converting notebook eda_v2.ipynb to html
[NbConvertApp] WARNING | Non-unique cell id 'e353102c' detected. Corrected to '05f8e080'.
[NbConvertApp] WARNING | Non-unique cell id 'f6925884-c986-4e6c-9d3b-a09c6f62d269' detected. Corrected to '25cb6701'.
[NbConvertApp] WARNING | Non-unique cell id 'b83a91f0-0294-4853-af22-773685e5626f' detected. Corrected to 'f44bc7ae'.
[NbConvertApp] WARNING | Non-unique cell id '574e711d-4494-4e3b-9861-bf2c506425d2' detected. Corrected to 'b0aaf74c'.
[NbConvertApp] WARNING | Non-unique cell id '9d2acf4f-45a6-43cb-b0bd-c29fe587010a' detected. Corrected to 'dc31989b'.
[NbConvertApp] WARNING | Non-unique cell id '0080127b-75f3-4635-98b5-d6be1d9a8347' detected. Corrected to '4de6558a'.
[NbConvertApp] WARNING | Non-unique cell id 'c995e0c7' detected. Corrected to '8055c5ff'.
[NbConvertApp] WARNING | Non-unique cell id 'c22185cd' detected. Corrected to '986ab5e6'.
[NbConvertApp] WARNING | Non-unique cell id 'f6925884-c986-4e6c-9d3b-a09c6f62d269' detected. Corrected to '7c5eaa53'.
[NbConvertApp] WARNING | Non-unique cell id '677894bb-d02e-44cf-804f-7e73e1bb7005' detected. Corrected to '417cacf0'.
[NbConvertApp] WARNING | Non-unique cell id 'b83a91f0-0294-4853-af22-773685e5626f' detected. Corrected to '5d3ab0bc'.
[NbConvertApp] WARNING | Non-unique cell id '574e711d-4494-4e3b-9861-bf2c506425d2' detected. Corrected to '6c10aab2'.
[NbConvertApp] WARNING | Non-unique cell id '9d2acf4f-45a6-43cb-b0bd-c29fe587010a' detected. Corrected to '6a397fb5'.
[NbConvertApp] WARNING | Non-unique cell id '0080127b-75f3-4635-98b5-d6be1d9a8347' detected. Corrected to 'eec614c9'.
[NbConvertApp] WARNING | Non-unique cell id 'c995e0c7' detected. Corrected to 'bc0d9f55'.
[NbConvertApp] WARNING | Non-unique cell id 'f6925884-c986-4e6c-9d3b-a09c6f62d269' detected. Corrected to '7e9e9a3a'.
[NbConvertApp] WARNING | Non-unique cell id '16733958' detected. Corrected to 'b1e88a30'.
[NbConvertApp] WARNING | Non-unique cell id 'b83a91f0-0294-4853-af22-773685e5626f' detected. Corrected to '948c9952'.
[NbConvertApp] WARNING | Non-unique cell id '574e711d-4494-4e3b-9861-bf2c506425d2' detected. Corrected to 'aa403d74'.
[NbConvertApp] WARNING | Non-unique cell id '9d2acf4f-45a6-43cb-b0bd-c29fe587010a' detected. Corrected to '24a87f42'.
[NbConvertApp] WARNING | Non-unique cell id '0080127b-75f3-4635-98b5-d6be1d9a8347' detected. Corrected to '323f82b0'.
[NbConvertApp] WARNING | Non-unique cell id 'c995e0c7' detected. Corrected to '98550f58'.
[NbConvertApp] WARNING | Non-unique cell id 'caa89228' detected. Corrected to '152daf8e'.
[NbConvertApp] Writing 5485524 bytes to eda_v2.html
In [ ]: